{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Root data directory; every other location below is derived from it.\n",
    "# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere.\n",
    "data_path = Path('/home/bnu/projects/CCKS2020-Entity-Linking/data')\n",
    "\n",
    "# Sub-directories used by the following cells.\n",
    "raw_path = data_path.joinpath('ccks2020_el_data_v1')  # dev.json is read from here\n",
    "pickle_path = data_path.joinpath('pickle')            # pickled lookup objects\n",
    "result_path = data_path.joinpath('result')            # result.json is written here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Person',\n",
       " 'Work',\n",
       " 'Medicine',\n",
       " 'Game',\n",
       " 'Other',\n",
       " 'Organization',\n",
       " 'Location',\n",
       " 'Culture',\n",
       " 'Biological',\n",
       " 'VirtualThings',\n",
       " 'Natural&Geography',\n",
       " 'Website',\n",
       " 'Event',\n",
       " 'Brand',\n",
       " 'Food',\n",
       " 'Awards',\n",
       " 'Time&Calendar',\n",
       " 'Disease&Symptom',\n",
       " 'Software',\n",
       " 'Vehicle',\n",
       " 'Education',\n",
       " 'Constellation',\n",
       " 'Diagnosis&Treatment',\n",
       " 'Law&Regulation']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the three pickled lookup objects stored under pickle_path.\n",
    "# NOTE(review): unpickling can execute arbitrary code - only load files you trust.\n",
    "kbid_to_entities, entity_to_kbids, idx_to_type = map(\n",
    "    pd.read_pickle,\n",
    "    (\n",
    "        pickle_path/'kbid_to_entities.pkl',\n",
    "        pickle_path/'entity_to_kbids.pkl',\n",
    "        pickle_path/'idx_to_type.pkl',\n",
    "    ),\n",
    ")\n",
    "\n",
    "# Display idx_to_type as the cell output.\n",
    "idx_to_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random baseline: every mention in dev.json receives a randomly drawn kb_id.\n",
    "#   * mention present in entity_to_kbids (with at least one candidate) -> one of its candidates\n",
    "#   * otherwise -> 'NIL_' followed by a random entry of idx_to_type\n",
    "# The annotated samples are collected in `result`, which the next cell writes to disk.\n",
    "\n",
    "SEED = 2020\n",
    "# Private seeded generator: re-running this cell reproduces the same predictions\n",
    "# and the global `random` state is left untouched.\n",
    "rng = random.Random(SEED)\n",
    "\n",
    "result = []\n",
    "# Explicit UTF-8 so decoding does not depend on the platform default encoding.\n",
    "with open(raw_path/'dev.json', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # tolerate blank lines, e.g. an empty line at the end of the file\n",
    "        sample = json.loads(line)\n",
    "        for mention_data in sample['mention_data']:\n",
    "            mention = mention_data['mention']\n",
    "            # sorted() fixes the candidate order so the seeded draw does not depend on\n",
    "            # set/hash iteration order (assumes the kb ids are mutually comparable, e.g. all str).\n",
    "            candidates = sorted(entity_to_kbids[mention]) if mention in entity_to_kbids else []\n",
    "            if candidates:\n",
    "                mention_data['kb_id'] = rng.choice(candidates)\n",
    "            else:\n",
    "                # Unknown mention (or no candidates): predict NIL with a random type.\n",
    "                mention_data['kb_id'] = 'NIL_' + rng.choice(idx_to_type)\n",
    "        result.append(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the predictions as JSON Lines: one JSON object per line.\n",
    "result_path.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists\n",
    "\n",
    "# ensure_ascii=False emits raw non-ASCII characters, so the file is opened as UTF-8\n",
    "# explicitly instead of relying on the platform default encoding.\n",
    "with open(result_path/'result.json', 'w', encoding='utf-8') as f:\n",
    "    for r in result:\n",
    "        json.dump(r, f, ensure_ascii=False)\n",
    "        f.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
